In [63]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

import os
path_dir = os.path.dirname(os.getcwd())

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_white"

plotly.offline.init_notebook_mode()
  • Source: https://www.kaggle.com/datasets/gsagar12/dspp1

1. Data loading & preparation¶

In [64]:
path_dir = os.path.join(os.path.dirname(os.getcwd()), 'datasets', 'customer_subscription')
os.listdir(path_dir)
Out[64]:
['product_info.csv',
 'customer_cases.csv',
 'customer_product.csv',
 'customer_info.csv']

1.1. Customer cases¶

In [65]:
# loading data
customer_cases = pd.read_csv(os.path.join(path_dir, "customer_cases.csv"))

# parsing dates
customer_cases['date_time'] = pd.to_datetime(customer_cases['date_time']) 

customer_cases.head()
Out[65]:
Unnamed: 0 case_id date_time customer_id channel reason
0 1 CC101 2017-01-01 10:32:03 C2448 phone signup
1 2 CC102 2017-01-01 11:35:47 C2449 phone signup
2 3 CC103 2017-01-01 11:37:09 C2450 phone signup
3 4 CC104 2017-01-01 13:28:14 C2451 phone signup
4 5 CC105 2017-01-01 13:52:22 C2452 phone signup

1.2. Customer product¶

In [66]:
# loading data
customer_product = pd.read_csv(os.path.join(path_dir, "customer_product.csv"))

# parsing dates
customer_product['signup_date_time'] = pd.to_datetime(customer_product['signup_date_time'])
customer_product['cancel_date_time'] = pd.to_datetime(customer_product['cancel_date_time'])

customer_product.head()
Out[66]:
Unnamed: 0 customer_id product signup_date_time cancel_date_time
0 1 C2448 prd_1 2017-01-01 10:35:09 NaT
1 2 C2449 prd_1 2017-01-01 11:39:29 2021-09-05 10:00:02
2 3 C2450 prd_1 2017-01-01 11:42:00 2019-01-13 16:24:55
3 4 C2451 prd_2 2017-01-01 13:32:08 NaT
4 5 C2452 prd_1 2017-01-01 13:57:30 2021-06-28 18:06:01
In [67]:
# CHECK: 1 product per customer
customer_product.groupby(['customer_id']).agg({'product': 'nunique'}).max()
Out[67]:
product    1
dtype: int64
In [68]:
# merging data

df = customer_cases.merge(customer_product, on=['customer_id'], how='left')\
    .drop(['Unnamed: 0_x', 'Unnamed: 0_y'], axis=1)

df.head()
Out[68]:
case_id date_time customer_id channel reason product signup_date_time cancel_date_time
0 CC101 2017-01-01 10:32:03 C2448 phone signup prd_1 2017-01-01 10:35:09 NaT
1 CC102 2017-01-01 11:35:47 C2449 phone signup prd_1 2017-01-01 11:39:29 2021-09-05 10:00:02
2 CC103 2017-01-01 11:37:09 C2450 phone signup prd_1 2017-01-01 11:42:00 2019-01-13 16:24:55
3 CC104 2017-01-01 13:28:14 C2451 phone signup prd_2 2017-01-01 13:32:08 NaT
4 CC105 2017-01-01 13:52:22 C2452 phone signup prd_1 2017-01-01 13:57:30 2021-06-28 18:06:01

1.3. Product info¶

In [69]:
# loading data
product_info = pd.read_csv(os.path.join(path_dir, "product_info.csv"))
product_info.head()
Out[69]:
product_id name price billing_cycle
0 prd_1 annual_subscription 1200 12
1 prd_2 monthly_subscription 125 1
In [70]:
# merging data

df = df.merge(
    product_info.rename(columns={'product_id': 'product'}), 
    on=['product'], how='left')

df.head()
Out[70]:
case_id date_time customer_id channel reason product signup_date_time cancel_date_time name price billing_cycle
0 CC101 2017-01-01 10:32:03 C2448 phone signup prd_1 2017-01-01 10:35:09 NaT annual_subscription 1200 12
1 CC102 2017-01-01 11:35:47 C2449 phone signup prd_1 2017-01-01 11:39:29 2021-09-05 10:00:02 annual_subscription 1200 12
2 CC103 2017-01-01 11:37:09 C2450 phone signup prd_1 2017-01-01 11:42:00 2019-01-13 16:24:55 annual_subscription 1200 12
3 CC104 2017-01-01 13:28:14 C2451 phone signup prd_2 2017-01-01 13:32:08 NaT monthly_subscription 125 1
4 CC105 2017-01-01 13:52:22 C2452 phone signup prd_1 2017-01-01 13:57:30 2021-06-28 18:06:01 annual_subscription 1200 12

1.4. Customer info¶

In [71]:
customer_info = pd.read_csv(os.path.join(path_dir, "customer_info.csv"))
customer_info.head()
Out[71]:
Unnamed: 0 customer_id age gender
0 1 C2448 76 female
1 2 C2449 61 male
2 3 C2450 58 female
3 4 C2451 62 female
4 5 C2452 71 male
In [72]:
# merging data

df = df.merge(
    customer_info.drop('Unnamed: 0',axis=1), 
    on=['customer_id']
)

df.head()
Out[72]:
case_id date_time customer_id channel reason product signup_date_time cancel_date_time name price billing_cycle age gender
0 CC101 2017-01-01 10:32:03 C2448 phone signup prd_1 2017-01-01 10:35:09 NaT annual_subscription 1200 12 76 female
1 CC102 2017-01-01 11:35:47 C2449 phone signup prd_1 2017-01-01 11:39:29 2021-09-05 10:00:02 annual_subscription 1200 12 61 male
2 CC103 2017-01-01 11:37:09 C2450 phone signup prd_1 2017-01-01 11:42:00 2019-01-13 16:24:55 annual_subscription 1200 12 58 female
3 CC104 2017-01-01 13:28:14 C2451 phone signup prd_2 2017-01-01 13:32:08 NaT monthly_subscription 125 1 62 female
4 CC4491 2017-03-31 12:06:58 C2451 phone support prd_2 2017-01-01 13:32:08 NaT monthly_subscription 125 1 62 female

2. Feature engineering¶

2.1. Target¶

In [73]:
date_max = max(df.cancel_date_time.max(), df.signup_date_time.max(), df.date_time.max())
date_max
Out[73]:
Timestamp('2022-01-01 06:32:53')
In [74]:
df['duration'] = (df['cancel_date_time'] - df['date_time']).dt.days
df['censored'] = df['duration'].isna().astype(int)
df.loc[df.censored==1, "duration"] = (date_max - df.loc[df.censored==1, "date_time"]).dt.days
In [76]:
# deleting data where customers reach out after they unsubscribe.
df = df[df.duration >0]
In [97]:
df.censored.mean()
Out[97]:
0.7947466731855306

2.2. Categorical variable encoding¶

In [77]:
df['age_bin'] = df.age.apply(lambda x:"[{},{}[".format(x//10*10, (x//10+1)*10))
In [78]:
df['product=prd_1'] = df['product'].map({'prd_1':1, 'prd_2':0})
df['gender=female'] = df.gender.map({'female':1, 'male':0})
df['channel=email'] = df.channel.map({'phone':0, 'email':1})
df['reason=support'] = df.reason.map({'signup':0, 'support':1})

2.3. Enrichment¶

In [79]:
# create a column containing, for a given observation, 
# the number of times the client has reach out before

def get_nb_cases(patient_df):
    patient_df['nb_cases'] = [i for i in range(patient_df.shape[0])]
    return patient_df

df = df.groupby('customer_id').apply(lambda df : get_nb_cases(df))
In [80]:
# yearly seasonlity
df['date_month_cos'] = df['date_time'].apply(lambda x: np.cos(2*np.pi*(x.month-1)/12))
df['date_month_sin'] = df['date_time'].apply(lambda x: np.sin(2*np.pi*(x.month-1)/12))

# weekly seasonlity
df['date_weekday_cos'] = df['date_time'].apply(lambda x: np.cos(2*np.pi*x.dayofweek/6))
df['date_weekday_sin'] = df['date_time'].apply(lambda x: np.sin(2*np.pi*x.dayofweek/6))

# daily seasonlity
df['date_hour_cos'] = df['date_time'].apply(lambda x: np.cos(2*np.pi*x.hour/24))
df['date_hour_sin'] = df['date_time'].apply(lambda x: np.sin(2*np.pi*x.hour/24))

3. Data exploration¶

In [81]:
px.histogram(df, x='duration')
In [82]:
px.box(df, y="duration", x='age_bin')
In [94]:
for c in ['channel', 'reason', 'name']:
    px.box(df, y="duration", x=c, width=500, height = 500).show()
In [85]:
px.imshow(df.corr())

5. Saving¶

In [ ]:
df.drop(['reason', 'product', 'gender', 'channel'], axis=1, inplace=True)
In [ ]:
df.to_csv(os.path.join(os.path.dirname(os.getcwd()), "outputs/customer_subscription_clean.csv"), index=False)